import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from statsmodels.stats.multitest import multipletests
from scipy.ndimage.filters import gaussian_filter as gf
from scipy.stats import truncnorm
from scipy.optimize import curve_fit
from tools import *
%matplotlib inline
plt.rcParams['legend.numpoints'] = 1
plt.rcParams['figure.dpi'] = 150
#plt.rcParams['figure.figsize'] = 5, 4
plt.rcParams['font.size'] = 8
resolution = 10000
n_bins = 791
input_fo='/home/lv/working_data/Streptomyces/18_D1186_III/2020_09/SCN_good_corrected/'
#os.listdir(input_fo)
batches=['L216','L263','LBM5']
counts={t:np.genfromtxt(input_fo+'%s_2M_corr.txt'%t) for t in batches}
comp_bounds=[128,561] #from np.argwhere(frontier_index[batch][1]>0.045), see below
gaussian_width=1 #to mitigate noise in generate_frontiers
max_s=60 #to mitigate noise in compute_frontier_indexes
domain_fo='figures/domains_gw=%.1f_max_s=%d/'%(gaussian_width,max_s)
if not os.path.exists(domain_fo): os.mkdir(domain_fo)
#mininum positive values to compute pseudo_counts
min_positive_counts={t:np.min(counts[t][counts[t]>0]) for t in batches}
max_counts={t:np.max(counts[t]) for t in batches}
#pseudo_counts={t:min_positive_counts[t]/2 for t in batches}
pseudo_counts={t:max_counts[t]/20 for t in batches}
frontiers_matrix = {t:generate_frontiers(np.log(counts[t]+pseudo_counts[t]),w=gaussian_width) for t in batches}
fig, axes = plt.subplots(figsize=(6, 12), ncols=2, nrows=3, tight_layout=True)
for i,t in enumerate(batches):
axes[i][0].imshow(
counts[t],
norm=colors.LogNorm(),
vmin=0.0005, vmax=0.1, cmap='bone_r')
axes[i][0].set_title('HiC heat map - %s'%t)
axes[i][1].imshow(
frontiers_matrix[t][0],
cmap="Greys",vmax=0.15);#,
# vmin=0,
# vmax=0.001);
axes[i][1].set_title('Frontier signal - %s'%t);
fig.savefig('results/frontier_maps.pdf', transparent=True)
We circularize the chromosome. In effect, it means:
frontier_index = {t:compute_frontier_indexes(frontiers_matrix[t],max_s=max_s) for t in batches}
batch_axes=['L263','LBM5']
fig, axes = plt.subplots(figsize=(8, 6), nrows=2, tight_layout=True)
loci=[i for i in range(len(frontier_index[batch_axes[0]][0]))]
for iplot in [0,1]:
for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
axes[iplot].plot(loci,[frontier_index[batch_axes[iplot]][i][l] for l in loci], '%s+-'%c,label=lab);
axes[iplot].set_title(batch_axes[iplot])
axes[iplot].set_ylim([0,7])
#for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
# axes[1].plot(loci,[frontier_index[batch][i][l] for l in loci], '%s+-'%c,label=lab);
#axes[1].set_xlim([0,200])
#axes[0].set_title(batch)
axes[1].set_xlabel('locus=(HiC) matrix index');
for i in [0, 1]:
axes[i].set_ylabel('Frontier index');
axes[i].legend(fontsize="medium");
fig.savefig('results/profiles.pdf', transparent=True)
#identification of all peaks with their value
peaks = {t:[find_peaks(frontier_index[t][i]) for i in range(2)] for t in batches}
#extract peaks values depending on compartments to perform stat analysis
peak_values={}
peak_values['core']={t:np.array([peaks[t][0][1][ip] for ip, p in enumerate(peaks[t][0][0]) if comp_bounds[0] <= p <= comp_bounds[1]]+
[peaks[t][1][1][ip] for ip, p in enumerate(peaks[t][1][0]) if comp_bounds[0] <= p <= comp_bounds[1]])
for t in batches}
peak_values['borders']={t:np.array([peaks[t][0][1][ip] for ip, p in enumerate(peaks[t][0][0]) if 0<p<comp_bounds[0] or p > comp_bounds[1]]+
[peaks[t][1][1][ip] for ip, p in enumerate(peaks[t][1][0]) if 0<p<comp_bounds[0] or p > comp_bounds[1]])
for t in batches}
#median (med) and median absolute deviaiton (mad)
#rem: compare to std, mad is a more robust estimate of the "variance of the null model"
med_peaks={comp:{t:np.median(peak_values[comp][t]) for t in batches} for comp in ['core','borders']}
mad_peaks={comp:{t:np.median(np.abs(peak_values[comp][t]-med_peaks[comp][t])) for t in batches} for comp in ['core','borders']}
rho=1.48
k_sigma=2
thresh={comp:{t:med_peaks[comp][t]+k_sigma*rho*mad_peaks[comp][t] for t in batches} for comp in ['core','borders']}
plt.figure(figsize=(6,4),tight_layout=True)
bins=np.arange(0,6,0.03)
comp='borders'
for it,batch in enumerate(batches):
plt.subplot(2,2,it+1)
plt.hist(peak_values[comp][batch],bins=bins,label=comp+'-'+batch);
plt.axvline(med_peaks[comp][batch],color='k',linestyle='dashed')
plt.axvline(thresh[comp][batch],color='r',linestyle='dashed')
plt.legend(loc=1)
plt.figure(figsize=(6,4),tight_layout=True)
bins=np.arange(0,6,0.03)
comp='core'
for it,batch in enumerate(batches):
plt.subplot(2,2,it+1)
plt.hist(peak_values[comp][batch],bins=bins,label=comp+'-'+batch);
plt.axvline(med_peaks[comp][batch],color='k',linestyle='dashed')
plt.axvline(thresh[comp][batch],color='r',linestyle='dashed')
plt.legend(loc=1)
Rem: here, significant = above an arbitrarly, yet reasonable thresholds
sign_peaks={} # [batch][i_FI][0]=loci, [1]=values
for batch in batches:
sign_peaks[batch]=[[],[]]
for i_FI in [0,1]:
sign_peaks[batch][i_FI]=[[peaks[batch][i_FI][0][i] for i in range(len(peaks[batch][i_FI][0]))
if ((2<=peaks[batch][i_FI][0][i]<=comp_bounds[0] and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]) or
(comp_bounds[0]<peaks[batch][i_FI][0][i]<=comp_bounds[1] and peaks[batch][i_FI][1][i]>=thresh['core'][batch]) or
(comp_bounds[1]<peaks[batch][i_FI][0][i]<=n_bins-2 and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]))],
[peaks[batch][i_FI][1][i] for i in range(len(peaks[batch][i_FI][0]))
if ((2<=peaks[batch][i_FI][0][i]<=comp_bounds[0] and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]) or
(comp_bounds[0]<peaks[batch][i_FI][0][i]<=comp_bounds[1] and peaks[batch][i_FI][1][i]>=thresh['core'][batch]) or
(comp_bounds[1]<peaks[batch][i_FI][0][i]<=n_bins-2 and peaks[batch][i_FI][1][i]>=thresh['borders'][batch]))]
]
for batch in batches:
print("%s"%batch)
print("Number of upstream peaks:\n\tborders:%d\tcore:%d"%(len([p for p in sign_peaks[batch][0][0] if p <= comp_bounds[0] or p>=comp_bounds[1]]),
len([p for p in sign_peaks[batch][0][0] if p > comp_bounds[0] and p<comp_bounds[1]])))
print("Number of downstream peaks:\n\tborders:%d\tcore:%d"%(len([p for p in sign_peaks[batch][1][0] if p <= comp_bounds[0] or p>=comp_bounds[1]]),
len([p for p in sign_peaks[batch][1][0] if p > comp_bounds[0] and p<comp_bounds[1]])))
print('\n')
batch='L263'
fig, axes = plt.subplots(figsize=(9, 6), ncols=3, nrows=2,tight_layout=True)
for irow in range(2):
axes[irow, 0].matshow(
counts[batch],
norm=colors.LogNorm(),
vmin=0.0005, vmax=0.1, cmap='bone_r')
axes[irow, 1].plot(frontier_index[batch][0], "C1-",label="$FI_{up}$")
axes[irow, 1].plot(frontier_index[batch][1], "C2-",label="$FI_{down}$")
axes[irow, 1].axhline(thresh['borders'][batch],color='k',linestyle='dashed',label='thresh-borders')
axes[irow, 1].axhline(thresh['core'][batch],color='k',linestyle='dotted',label='thresh-core')
axes[irow, 2].matshow(frontiers_matrix[batch][0],
cmap=plt.get_cmap('Greys'))#, vmin=0, vmax=0.001);
[axes[irow, 2].plot([i, len(counts[batch])], [i, i], color="C1", alpha=0.7)
for i in sign_peaks[batch][0][0]];
[axes[irow, 2].plot([i, i], [i,len(counts[batch])], color="C1", alpha=0.7)
for i in sign_peaks[batch][0][0]];
[axes[irow, 2].plot([0, i], [i, i], color="C2", alpha=0.7)
for i in sign_peaks[batch][1][0]];
[axes[irow, 2].plot([i, i], [0, i], color="C2", alpha=0.7)
for i in sign_peaks[batch][1][0]];
# Set titles on the plots of the first row
if irow == 0:
axes[irow, 0].set_title('HiC', pad=0.5)
axes[irow, 1].set_title('Frontier index (FI)')
axes[irow, 2].set_title('Frontier matrix and FI peaks',pad=0.5)
if irow == 0:
for j in [0,2]:
axes[irow, j].set_xlim([0,n_bins/2])
axes[irow][j].set_ylim([n_bins/2,0])
axes[irow, j].xaxis.set_ticks_position('bottom')
axes[irow, 1].set_xlim([0, n_bins/2])
axes[irow, 1].set_ylim([0, 6])
axes[irow, 1].legend(loc=2,fontsize='large')
else:
for j in [0, 2]:
axes[irow, j].set_xlim([n_bins/2, n_bins])
axes[irow, j].set_ylim([n_bins, n_bins/2])
axes[irow, j].xaxis.set_ticks_position('bottom')
axes[irow, 1].set_xlim([n_bins/2, n_bins])
axes[irow, 1].set_ylim([0, 6])
axes[irow, 1].legend(loc=1,fontsize='large')
fig.savefig('results/example_frontier_peaks_L263.pdf')
batch='LBM5'
fig, axes = plt.subplots(figsize=(9, 6), ncols=3, nrows=2,tight_layout=True)
for irow in range(2):
axes[irow, 0].matshow(
counts[batch],
norm=colors.LogNorm(),
vmin=0.0005, vmax=0.1, cmap='bone_r')
axes[irow, 1].plot(frontier_index[batch][0], "C1-",label="$FI_{up}$")
axes[irow, 1].plot(frontier_index[batch][1], "C2-",label="$FI_{down}$")
if irow==0:
axes[irow, 1].axhline(thresh['borders'][batch],color='k',linestyle='dashed')
else:
axes[irow, 1].axhline(thresh['core'][batch],color='k',linestyle='dashed')
axes[irow, 2].matshow(frontiers_matrix[batch][0],
cmap=plt.get_cmap('Greys'))#, vmin=0, vmax=0.001);
[axes[irow, 2].plot([i, len(counts[batch])], [i, i], color="C1", alpha=0.7)
for i in sign_peaks[batch][0][0]];
[axes[irow, 2].plot([i, i], [i,len(counts[batch])], color="C1", alpha=0.7)
for i in sign_peaks[batch][0][0]];
[axes[irow, 2].plot([0, i], [i, i], color="C2", alpha=0.7)
for i in sign_peaks[batch][1][0]];
[axes[irow, 2].plot([i, i], [0, i], color="C2", alpha=0.7)
for i in sign_peaks[batch][1][0]];
# Set titles on the plots of the first row
if irow == 0:
axes[irow, 0].set_title('HiC', pad=0.5)
axes[irow, 1].set_title('Frontier index (FI)')
axes[irow, 2].set_title('Frontier matrix and FI peaks',pad=0.5)
if irow == 0:
for j in [0,2]:
axes[irow, j].set_xlim([0,n_bins/2])
axes[irow][j].set_ylim([n_bins/2,0])
axes[irow, j].xaxis.set_ticks_position('bottom')
axes[irow, 1].set_xlim([0, n_bins/2])
axes[irow, 1].set_ylim([0, 6])
axes[irow, 1].legend(loc=2,fontsize='large')
else:
for j in [0, 2]:
axes[irow, j].set_xlim([n_bins/2, n_bins])
axes[irow, j].set_ylim([n_bins, n_bins/2])
axes[irow, j].xaxis.set_ticks_position('bottom')
axes[irow, 1].set_xlim([n_bins/2, n_bins])
axes[irow, 1].set_ylim([0, 6])
axes[irow, 1].legend(loc=1,fontsize='large')
fig.savefig('results/example_frontier_peaks_LBM5.pdf')
data_fo='results/gw=%.1f_max_s=%d/'%(gaussian_width,max_s)
if not os.path.exists(data_fo): os.mkdir(data_fo)
for batch in batches:
#upstream
with open(data_fo+'%s_upstream_peaks.txt'%batch,'w') as out:
# for i in sorted([i for i in range(len(sign_peaks[batch][0][0]))],key=lambda i:-sign_peaks[batch][0][1][i]):
for i in range(len(sign_peaks[batch][0][0])): out.write("%d\t%.5f\n"%(sign_peaks[batch][0][0][i]+1, sign_peaks[batch][0][1][i]))
#downstream
with open(data_fo+'%s_downstream_peaks.txt'%batch,'w') as out:
# for i in sorted([i for i in range(len(sign_peaks[batch][1][0]))],key=lambda i:-sign_peaks[batch][1][1][i]):
for i in range(len(sign_peaks[batch][1][0])): out.write("%d\t%.5f\n"%(sign_peaks[batch][1][0][i]+1, sign_peaks[batch][1][1][i]))